library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.7 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.0
✔ readr 2.1.2 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(modelr)
library(caret)
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
# Load the savings dataset from the CodeClanData package and inspect it
savings <- CodeClanData::savings
savings

# Deliberately overfitted model: regress savings on every other column,
# including identifier-like variables (name, surname) via the `.` shorthand
model_overfit <- lm(formula = savings ~ ., data = savings)

summary(model_overfit)
Call:
lm(formula = savings ~ ., data = savings)
Residuals:
Min 1Q Median 3Q Max
-32617 -6684 -264 6481 31884
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.050e+04 2.591e+03 7.915 5.81e-15 ***
genderMale -1.049e+03 1.706e+03 -0.615 0.5386
nameAngus 1.531e+03 1.755e+03 0.873 0.3830
nameBert -1.973e+02 1.726e+03 -0.114 0.9090
nameBonnie 1.261e+03 1.809e+03 0.697 0.4859
nameCara -8.515e+02 1.807e+03 -0.471 0.6376
nameCharles 2.419e+03 1.786e+03 1.354 0.1759
nameDonald 1.253e+02 1.823e+03 0.069 0.9452
nameDora -5.149e+02 1.995e+03 -0.258 0.7964
nameEd 1.249e+03 1.693e+03 0.738 0.4606
nameEmmy -1.062e+03 1.816e+03 -0.585 0.5587
nameFlorence 9.245e+02 1.833e+03 0.504 0.6142
nameFreddy 6.287e+02 1.792e+03 0.351 0.7257
nameGilly -1.518e+03 1.764e+03 -0.861 0.3896
nameGord -1.246e+02 1.756e+03 -0.071 0.9435
nameHarry 3.218e+02 1.742e+03 0.185 0.8535
nameHelena 4.456e+02 1.748e+03 0.255 0.7988
nameIndia 9.007e+02 1.785e+03 0.505 0.6139
nameIvan 9.941e+02 1.780e+03 0.559 0.5766
nameJimmy NA NA NA NA
nameJools 6.287e+02 1.772e+03 0.355 0.7228
surnameFraser -1.113e+03 1.673e+03 -0.665 0.5060
surnameGaldie 2.034e+03 1.839e+03 1.106 0.2689
surnameHalcrow -7.546e+02 1.819e+03 -0.415 0.6783
surnameHenderson -3.788e+02 1.751e+03 -0.216 0.8288
surnameIrvine 3.610e+03 1.713e+03 2.108 0.0353 *
surnameJamieson 4.129e+02 1.831e+03 0.226 0.8216
surnameJohnson 4.237e+02 1.826e+03 0.232 0.8165
surnameLaurenceso -3.532e+02 1.839e+03 -0.192 0.8478
surnameLeask 2.301e+03 1.697e+03 1.356 0.1754
surnameManson 1.409e+03 1.670e+03 0.843 0.3992
surnameMowat 9.803e+02 1.699e+03 0.577 0.5641
surnameNicolson -1.051e+03 1.816e+03 -0.578 0.5631
surnamePeterson 3.189e+03 1.706e+03 1.869 0.0618 .
surnameRobertson 2.960e+03 1.650e+03 1.794 0.0731 .
surnameSinclair 4.578e+02 1.790e+03 0.256 0.7982
surnameSmith 6.355e+02 1.707e+03 0.372 0.7098
surnameTait 2.379e+03 1.721e+03 1.382 0.1673
surnameThomason 2.579e+03 1.835e+03 1.405 0.1603
surnameWilliamson 3.408e+03 1.780e+03 1.914 0.0558 .
job_areaLegal 2.798e+02 9.491e+02 0.295 0.7682
job_areaProduct Management 2.396e+02 9.720e+02 0.247 0.8053
job_areaSales 7.254e+01 9.458e+02 0.077 0.9389
job_areaTraining 6.860e+02 9.638e+02 0.712 0.4767
salary 3.312e-01 6.586e-02 5.028 5.75e-07 ***
age 2.473e+02 2.492e+01 9.924 < 2e-16 ***
retiredYes -2.120e+04 1.091e+03 -19.425 < 2e-16 ***
locationEdinburgh -3.907e+02 1.230e+03 -0.318 0.7509
locationGlasgow 9.128e+01 1.200e+03 0.076 0.9394
locationInverness 4.338e+02 1.194e+03 0.363 0.7165
locationOrkney -1.550e+03 1.223e+03 -1.267 0.2053
locationShetland -6.597e+02 1.201e+03 -0.549 0.5829
locationStirling 1.018e+02 1.250e+03 0.081 0.9351
locationWestern Isles -1.211e+03 1.214e+03 -0.998 0.3186
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9939 on 1147 degrees of freedom
Multiple R-squared: 0.3345, Adjusted R-squared: 0.3044
F-statistic: 11.09 on 52 and 1147 DF, p-value: < 2.2e-16
# Base-R diagnostic plots for the overfitted lm object
plot(model_overfit)
# Well-fitted model: keep only the predictors with a plausible
# relationship to savings (salary, age, retirement status)
model_wellfit <- lm(
  formula = savings ~ salary + age + retired,
  data = savings
)

summary(model_wellfit)
Call:
lm(formula = savings ~ salary + age + retired, data = savings)
Residuals:
Min 1Q Median 3Q Max
-31330.5 -6764.2 147.7 6625.2 30518.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.125e+04 1.659e+03 12.812 < 2e-16 ***
salary 3.376e-01 6.443e-02 5.239 1.9e-07 ***
age 2.463e+02 2.432e+01 10.127 < 2e-16 ***
retiredYes -2.102e+04 1.066e+03 -19.710 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9918 on 1196 degrees of freedom
Multiple R-squared: 0.309, Adjusted R-squared: 0.3073
F-statistic: 178.3 on 3 and 1196 DF, p-value: < 2.2e-16
# Diagnostic plots for the well-fitted model
plot(model_wellfit)

# Underfitted model: a single predictor only
model_underfit <- lm(formula = savings ~ salary, data = savings)

summary(model_underfit)
Call:
lm(formula = savings ~ salary, data = savings)
Residuals:
Min 1Q Median 3Q Max
-36246 -7847 62 8127 37386
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.701e+04 1.547e+03 17.459 < 2e-16 ***
salary 3.466e-01 7.679e-02 4.514 6.99e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 11820 on 1198 degrees of freedom
Multiple R-squared: 0.01672, Adjusted R-squared: 0.0159
F-statistic: 20.38 on 1 and 1198 DF, p-value: 6.992e-06
# Base-R diagnostic plots for the underfitted lm object
plot(model_underfit)
Pull out the adjusted R-squared value from each of the three models
# Compare goodness-of-fit measures across the three models.
# Adjusted R-squared penalises extra predictors, unlike multiple R-squared.
summary(model_overfit)$adj.r.squared
[1] 0.304378
summary(model_wellfit)$adj.r.squared
[1] 0.3072727
summary(model_underfit)$adj.r.squared
[1] 0.01590247
# AIC: lower is better; the penalty grows with the number of parameters
AIC(model_overfit)
[1] 25549.42
AIC(model_wellfit)
[1] 25496.62
AIC(model_underfit)
[1] 25915.93
# BIC: like AIC but with a harsher penalty for model size.
# The well-fitted model scores best on all three measures.
BIC(model_overfit)
[1] 25824.29
BIC(model_wellfit)
[1] 25522.07
BIC(model_underfit)
[1] 25931.2
Put the model summary statistics into tidy data format using broom
# Full (untidy) console summary of the overfitted model, for contrast
# with the tidy one-row output from broom::glance() below
summary(model_overfit)
Call:
lm(formula = savings ~ ., data = savings)
Residuals:
Min 1Q Median 3Q Max
-32617 -6684 -264 6481 31884
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.050e+04 2.591e+03 7.915 5.81e-15 ***
genderMale -1.049e+03 1.706e+03 -0.615 0.5386
nameAngus 1.531e+03 1.755e+03 0.873 0.3830
nameBert -1.973e+02 1.726e+03 -0.114 0.9090
nameBonnie 1.261e+03 1.809e+03 0.697 0.4859
nameCara -8.515e+02 1.807e+03 -0.471 0.6376
nameCharles 2.419e+03 1.786e+03 1.354 0.1759
nameDonald 1.253e+02 1.823e+03 0.069 0.9452
nameDora -5.149e+02 1.995e+03 -0.258 0.7964
nameEd 1.249e+03 1.693e+03 0.738 0.4606
nameEmmy -1.062e+03 1.816e+03 -0.585 0.5587
nameFlorence 9.245e+02 1.833e+03 0.504 0.6142
nameFreddy 6.287e+02 1.792e+03 0.351 0.7257
nameGilly -1.518e+03 1.764e+03 -0.861 0.3896
nameGord -1.246e+02 1.756e+03 -0.071 0.9435
nameHarry 3.218e+02 1.742e+03 0.185 0.8535
nameHelena 4.456e+02 1.748e+03 0.255 0.7988
nameIndia 9.007e+02 1.785e+03 0.505 0.6139
nameIvan 9.941e+02 1.780e+03 0.559 0.5766
nameJimmy NA NA NA NA
nameJools 6.287e+02 1.772e+03 0.355 0.7228
surnameFraser -1.113e+03 1.673e+03 -0.665 0.5060
surnameGaldie 2.034e+03 1.839e+03 1.106 0.2689
surnameHalcrow -7.546e+02 1.819e+03 -0.415 0.6783
surnameHenderson -3.788e+02 1.751e+03 -0.216 0.8288
surnameIrvine 3.610e+03 1.713e+03 2.108 0.0353 *
surnameJamieson 4.129e+02 1.831e+03 0.226 0.8216
surnameJohnson 4.237e+02 1.826e+03 0.232 0.8165
surnameLaurenceso -3.532e+02 1.839e+03 -0.192 0.8478
surnameLeask 2.301e+03 1.697e+03 1.356 0.1754
surnameManson 1.409e+03 1.670e+03 0.843 0.3992
surnameMowat 9.803e+02 1.699e+03 0.577 0.5641
surnameNicolson -1.051e+03 1.816e+03 -0.578 0.5631
surnamePeterson 3.189e+03 1.706e+03 1.869 0.0618 .
surnameRobertson 2.960e+03 1.650e+03 1.794 0.0731 .
surnameSinclair 4.578e+02 1.790e+03 0.256 0.7982
surnameSmith 6.355e+02 1.707e+03 0.372 0.7098
surnameTait 2.379e+03 1.721e+03 1.382 0.1673
surnameThomason 2.579e+03 1.835e+03 1.405 0.1603
surnameWilliamson 3.408e+03 1.780e+03 1.914 0.0558 .
job_areaLegal 2.798e+02 9.491e+02 0.295 0.7682
job_areaProduct Management 2.396e+02 9.720e+02 0.247 0.8053
job_areaSales 7.254e+01 9.458e+02 0.077 0.9389
job_areaTraining 6.860e+02 9.638e+02 0.712 0.4767
salary 3.312e-01 6.586e-02 5.028 5.75e-07 ***
age 2.473e+02 2.492e+01 9.924 < 2e-16 ***
retiredYes -2.120e+04 1.091e+03 -19.425 < 2e-16 ***
locationEdinburgh -3.907e+02 1.230e+03 -0.318 0.7509
locationGlasgow 9.128e+01 1.200e+03 0.076 0.9394
locationInverness 4.338e+02 1.194e+03 0.363 0.7165
locationOrkney -1.550e+03 1.223e+03 -1.267 0.2053
locationShetland -6.597e+02 1.201e+03 -0.549 0.5829
locationStirling 1.018e+02 1.250e+03 0.081 0.9351
locationWestern Isles -1.211e+03 1.214e+03 -0.998 0.3186
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9939 on 1147 degrees of freedom
Multiple R-squared: 0.3345, Adjusted R-squared: 0.3044
F-statistic: 11.09 on 52 and 1147 DF, p-value: < 2.2e-16
# broom::glance() returns the model-level statistics
# (r.squared, adj.r.squared, AIC, BIC, ...) as a one-row tibble
broom::glance(model_overfit)
Get our test and train sets
Fit a model to the TRAINING set only
Calculate the mean squared error of its predictions on the TEST set
# Mean squared error on the TEST set.
# NOTE(review): `predictions_test`, `train` and `model` are created in an
# earlier chunk not shown here (the train/test split) — confirm before running.
predictions_test <- predictions_test %>%
mutate(sq_err = (pred - savings)^2)
mse_test <- mean(predictions_test$sq_err)
mse_test # normally this would be sqrt'd -> RMSE
[1] 106665166
# Root mean squared error: back on the same scale as `savings`
sqrt(mse_test)
[1] 10327.88
# Repeat on the TRAINING set: modelr::add_predictions() appends a
# `pred` column of fitted values from `model`
predictions_train <- train %>%
add_predictions(model) %>%
select(savings, pred)
predictions_train
predictions_train <- predictions_train %>%
mutate(sq_err = (pred - savings) ^ 2)
mse_train <- mean(predictions_train$sq_err)
mse_train
[1] 96034515
# Train RMSE (9799.72) is lower than test RMSE (10327.88), as expected:
# the model was fitted to minimise error on the training data
sqrt(mse_train)
[1] 9799.72
# NOTE(review): `model_cv` is created in a chunk not shown here —
# from its printout, a caret linear-regression model fitted with
# 10-fold cross-validation on 3 predictors.
model_cv
Linear Regression
1200 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1080, 1080, 1080, 1080, 1080, 1080, ...
Resampling results:
RMSE Rsquared MAE
9896.834 0.3152612 7902.707
Tuning parameter 'intercept' was held constant at a value of TRUE
# Held-out predictions and per-fold performance metrics
model_cv$pred
model_cv$resample
# Averaging the per-fold RMSEs reproduces the summary RMSE above
mean(model_cv$resample$RMSE)
[1] 9896.834
# NOTE(review): `model_cv_all` is created in a chunk not shown here —
# the cross-validated model refitted with all 8 predictors.
model_cv_all
Linear Regression
1200 samples
8 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1080, 1080, 1080, 1080, 1080, 1080, ...
Resampling results:
RMSE Rsquared MAE
10194.96 0.2730269 8120.347
Tuning parameter 'intercept' was held constant at a value of TRUE
# Plot per-fold RMSE to see fold-to-fold variability
model_cv_all$resample %>%
ggplot(aes(x = Resample,
y = RMSE))+
geom_col()
# The 8-predictor model cross-validates worse than the 3-predictor one:
# lower mean R-squared, higher mean RMSE
mean(model_cv_all$resample$Rsquared)
[1] 0.2730269
mean(model_cv_all$resample$RMSE)
[1] 10194.96
test: 20%
train: 60%
validation: 20%
fit several models with varying hyperparameters
find the best combination of hyperparams for each type of model
use the validation set to choose the hyperparams
re-train model on entire training set (train + validation)
We can prevent data leakage in two ways: by carrying out pre-processing on the training dataset separately from the testing/validation sets, and by selecting only predictor variables that would actually be available to us at the point where we want to apply the model.